# Read the SLO homes data, standardize column names to snake_case, and keep
# only the three cities we want to compare
homes <- read_csv("slo_homes.csv") %>%
  janitor::clean_names() %>%
  filter(city == "San Luis Obispo" |
           city == "Atascadero" |
           city == "Arroyo Grande")
## Parsed with column specification:
## cols(
## City = col_character(),
## Price = col_double(),
## Bedrooms = col_double(),
## Bathrooms = col_double(),
## SqFt = col_double(),
## PricePerSqFt = col_double(),
## Status = col_character()
## )
Look at correlations between numeric variables (checking to see whether collinearity might be a concern). When interpreting the matrix, ignore the price_per_sq_ft row/column, since it is a metric calculated directly from price and sq_ft.
# Correlation matrix across the numeric columns of homes
# (columns 2-6: price, bedrooms, bathrooms, sq_ft, price_per_sq_ft)
homes_cor <- homes[, 2:6] %>% cor()
homes_cor
## price bedrooms bathrooms sq_ft price_per_sq_ft
## price 1.0000000 0.31152578 0.4991453 0.6618575 0.73664029
## bedrooms 0.3115258 1.00000000 0.7519186 0.6960233 -0.09994633
## bathrooms 0.4991453 0.75191865 1.0000000 0.8046112 0.07624770
## sq_ft 0.6618575 0.69602326 0.8046112 1.0000000 0.12549708
## price_per_sq_ft 0.7366403 -0.09994633 0.0762477 0.1254971 1.00000000
Make a correlogram (plot of correlations):
# Correlogram (visual display of the correlation matrix)
homes_cor %>% corrplot()
# Audible cue that the chunk has finished running
beep(2)
Start with a complete model including all variables for now:
# linear model with outcome as price (assumes no interaction)
# Full model: every candidate predictor entered as a main effect
home_lm <- lm(price ~ city + bedrooms + bathrooms + sq_ft + status,
data = homes)
home_lm # printing shows the call and coefficients only
##
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status,
## data = homes)
##
## Coefficients:
## (Intercept) cityAtascadero citySan Luis Obispo
## 184130.9 -167396.4 31018.1
## bedrooms bathrooms sq_ft
## -161645.5 48692.0 389.1
## statusRegular statusShort Sale
## 303964.6 -19828.6
Let’s interpret the model results.
(Tip: A good guideline is at least 20 samples on each variable level you include in the regression.)
# Simplified model: drop bedrooms & bathrooms, which are strongly correlated
# with sq_ft in homes_cor (0.70 and 0.80), keeping sq_ft as the size predictor
home_lm2 <- lm(price ~ city + sq_ft + status,
data = homes)
home_lm2
##
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes)
##
## Coefficients:
## (Intercept) cityAtascadero citySan Luis Obispo
## -105821 -182587 70279
## sq_ft statusRegular statusShort Sale
## 325 315441 31284
On average, home price increases by $325 for every 1 sq ft increase in home size. (This is still pretty pricey, but makes sense for the central coast.)
AIC values: AIC describes the balance between model fit and model complexity — a lower AIC value indicates a better trade-off between fit and parsimony.
AIC(home_lm) # values not meaningful on their own, but useful in comparison...
## [1] 3576.834
AIC(home_lm2) # higher AIC than home_lm, even though home_lm2 is simpler
## [1] 3584.3
# Be careful: AIC is not useful on its own! Consider your conceptual evidence from your interpretation of the data.
Let’s check out the adjusted R-squared value to see model fit.
# Full-model summary; note bedrooms' large negative coefficient — likely an
# artifact of collinearity with sq_ft/bathrooms rather than a real price effect
summary(home_lm) # don't throw out coefficients based on significance alone!
##
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status,
## data = homes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -843938 -115963 -12298 109718 3445077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 184130.93 143368.84 1.284 0.20157
## cityAtascadero -167396.39 78701.95 -2.127 0.03552 *
## citySan Luis Obispo 31018.14 114895.10 0.270 0.78766
## bedrooms -161645.51 49414.32 -3.271 0.00141 **
## bathrooms 48692.04 52408.63 0.929 0.35476
## sq_ft 389.12 53.17 7.318 3.43e-11 ***
## statusRegular 303964.64 105942.81 2.869 0.00488 **
## statusShort Sale -19828.56 86335.35 -0.230 0.81875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 380600 on 117 degrees of freedom
## Multiple R-squared: 0.5682, Adjusted R-squared: 0.5424
## F-statistic: 22 on 7 and 117 DF, p-value: < 2.2e-16
# Simplified-model summary: adjusted R-squared 0.507 vs 0.542 for the full model
summary(home_lm2) # simpler model, with coefficients that make sense, but slightly smaller adjusted R-squared
##
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -653118 -120914 -8844 101402 3644738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105820.71 118697.40 -0.892 0.3745
## cityAtascadero -182586.90 80683.44 -2.263 0.0254 *
## citySan Luis Obispo 70278.97 115846.31 0.607 0.5452
## sq_ft 325.03 31.54 10.306 <2e-16 ***
## statusRegular 315441.26 109749.65 2.874 0.0048 **
## statusShort Sale 31284.44 87356.11 0.358 0.7209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 395100 on 119 degrees of freedom
## Multiple R-squared: 0.5268, Adjusted R-squared: 0.5069
## F-statistic: 26.49 on 5 and 119 DF, p-value: < 2.2e-16
Let’s ask the stargazer package to create the summary regression table for us! Building such a table by hand is a nightmare.
# results 'asis' because stargazer outputs to HTML already
# (the chunk option results = 'asis' lets that raw HTML render in the knitted doc)
stargazer(home_lm, home_lm2, type = "html")
| Dependent variable: | ||
| price | ||
| (1) | (2) | |
| cityAtascadero | -167,396.400** | -182,586.900** |
| (78,701.950) | (80,683.440) | |
| citySan Luis Obispo | 31,018.140 | 70,278.970 |
| (114,895.100) | (115,846.300) | |
| bedrooms | -161,645.500*** | |
| (49,414.320) | ||
| bathrooms | 48,692.040 | |
| (52,408.630) | ||
| sq_ft | 389.120*** | 325.028*** |
| (53.171) | (31.538) | |
| statusRegular | 303,964.600*** | 315,441.300*** |
| (105,942.800) | (109,749.700) | |
| statusShort Sale | -19,828.560 | 31,284.440 |
| (86,335.350) | (87,356.110) | |
| Constant | 184,130.900 | -105,820.700 |
| (143,368.800) | (118,697.400) | |
| Observations | 125 | 125 |
| R2 | 0.568 | 0.527 |
| Adjusted R2 | 0.542 | 0.507 |
| Residual Std. Error | 380,586.300 (df = 117) | 395,084.400 (df = 119) |
| F Statistic | 21.997*** (df = 7; 117) | 26.491*** (df = 5; 119) |
| Note: | *p<0.1; **p<0.05; ***p<0.01 | |
# can also open this HTML table in Word to customize it, but can also customize thru stargazer directly (might be a pain)
Now let’s check out diagnostic plots to see if our assumptions (normality of residuals and homoscedasticity) are satisfied or if we’re really concerned:
# Base-R diagnostic plots for the simplified model: residuals vs fitted,
# normal Q-Q, scale-location, and residuals vs leverage
plot(home_lm2)
Make some home price predictions with new data. We’re going to use that simplified model (home_lm2).
# Prediction grid: 10 evenly spaced home sizes (0-5000 sq ft) for each of the
# three cities, all with "Regular" sale status (30 rows total).
# Fixes: spell out length.out (avoid partial argument matching) and make the
# sq_ft repetition explicit with times = 3 instead of relying on data.frame's
# silent recycling of a bare rep().
new_df <- data.frame(city = rep(c("San Luis Obispo", "Arroyo Grande", "Atascadero"), each = 10),
                     sq_ft = rep(seq(0, 5000, length.out = 10), times = 3),
                     status = "Regular")
Use predict function to make new predictions using home_lm2:
# Predicted price for each row of new_df, using the simplified model
predict_df <- predict(home_lm2, newdata = new_df)
# Bind predictions to the inputs they correspond to, for plotting
full_df <- data.frame(new_df, predict_df)
Make a graph with the actual data, and what our model actually predicts:
# Two data sources in one figure: start from an empty ggplot() call and give
# each layer its own data argument — observed homes as points, model
# predictions as lines
ggplot() +
  geom_point(data = homes,
             aes(x = sq_ft,
                 y = price,
                 color = city,
                 shape = city)) +
  geom_line(data = full_df,
            aes(x = sq_ft,
                y = predict_df,
                color = city)) +
  theme_light()
# One panel per city, each with its own OLS fit line
ggplot(data = homes, aes(x = sq_ft, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(vars(city))
The sf package (sf = “simple features”)
Has sticky geometry! Maintains the spatial information in the dataframe. This means we can just work with attributes like a normal data frame for wrangling, viz, etc.
Get the California dams data:
# Read CA dams data; qualify clean_names() with janitor:: (consistent with the
# homes chunk earlier in this file) and drop rows missing any field we need
# for mapping/animation
dams <- read_csv("ca_dams.csv") %>%
  janitor::clean_names() %>%
  # one drop_na() call removes rows with an NA in ANY listed column —
  # equivalent to chaining three separate drop_na() calls
  drop_na(latitude, longitude, year_completed)
## Parsed with column specification:
## cols(
## .default = col_character(),
## RECORDID = col_double(),
## STATEID = col_logical(),
## LONGITUDE = col_double(),
## LATITUDE = col_double(),
## DISTANCE = col_double(),
## YEAR_COMPLETED = col_double(),
## DAM_LENGTH = col_double(),
## DAM_HEIGHT = col_double(),
## STRUCTURAL_HEIGHT = col_double(),
## HYDRAULIC_HEIGHT = col_double(),
## NID_HEIGHT = col_double(),
## MAX_DISCHARGE = col_double(),
## MAX_STORAGE = col_double(),
## NORMAL_STORAGE = col_double(),
## NID_STORAGE = col_double(),
## SURFACE_AREA = col_double(),
## DRAINAGE_AREA = col_double(),
## INSPECTION_FREQUENCY = col_double(),
## SPILLWAY_WIDTH = col_double(),
## VOLUME = col_double()
## # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 109 parsing failures.
## row col expected actual file
## 1337 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1338 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1339 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1340 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1341 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## .... ......... .................. ...... .............
## See problems(...) for more details.
Convert lat/long numbers to simple features data (sf)
# Convert the lat/long columns to sf point geometry; supply the coordinate
# reference system at creation (EPSG:4326 = WGS84 long/lat) instead of
# assigning st_crs() in a separate step afterwards
dams_sf <- st_as_sf(dams,
                    coords = c("longitude", "latitude"), # x (long) first, then y (lat)
                    crs = 4326)
plot(dams_sf) # basic plot works OK for initial review
## Warning: plotting the first 9 out of 67 attributes; use max.plot = 67 to
## plot all
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
Now let’s read in the shapefile data for the state of CA (outer boundary TIGER shapefile data).
# Read the CA state boundary (TIGER 2016 shapefile) from the ca_state_border folder
ca_border <- read_sf(here::here("ca_state_border"), layer = "CA_State_TIGER2016")
plot(ca_border) # if you look at the table for ca_border, it holds the entire geometry of the polygon in 'geometry'
## Warning: plotting the first 9 out of 14 attributes; use max.plot = 14 to
## plot all
Now, plot dams in California over the CA outline, in ggplot:
# Static map: dam locations (point size mapped to dam height) drawn on top of
# the CA state outline
ggplot() +
  geom_sf(data = ca_border, fill = "slate grey") +
  geom_sf(data = dams_sf,
          aes(size = dam_height),
          color = "dark red",
          alpha = 0.5,
          show.legend = FALSE)
Let’s make an animated plot:
# Animated map (gganimate): each dam appears in the year it was completed
ggplot() +
geom_sf(data = ca_border) +
geom_sf(data = dams_sf, size = 1, color = "gray50") +
theme_void() +
transition_time(year_completed) + # animation time; give it the variable that contains that info
shadow_mark(alpha = 1) # by default shows up & disappears, this makes sure they stay